# Construct Population Variables

##
## Population in general (Penn World Tables)
##
# Read the Penn World Tables extract, derive the log of total population per
# country-year, and left-join it onto Master by actorid and year.
penn <- read.csv(file = "Daten/Penn World Tables/penntables.csv",
                 header = TRUE, sep = ",")
# Translate ISO3 country codes into the "cown" coding used for actorid.
penn$actorid <- countrycode(penn$country.isocode, "iso3c", "cown")
penn$population_log <- log(penn$POP)
# Keep only the join keys and the new variable. Rows whose ISO3 code could not
# be mapped (actorid is NA) are dropped, mirroring the WDI section below, so
# that unmapped entries cannot be matched onto Master rows via an NA key.
penn <- subset(penn, !is.na(actorid),
               select = c(actorid, year, population_log))
Master <- join(Master, penn, type = "left", by = c("actorid", "year"),
               match = "all")


## 
## Rural population (World Bank World Development Indicator)
## and: Population density
##

# World Bank indicator codes to download.
wdilist <- c("SP.RUR.TOTL",    # Rural population
             "SP.RUR.TOTL.ZS", # Rural population (% of total population)
             "EN.POP.DNST")    # Population density

# Extract latest version of desired variables from WDI (1989-2009, all
# countries, without the extra metadata columns).
wdi <- WDI(country = "all", indicator = wdilist, extra = FALSE,
           start = 1989, end = 2009)

# Translate ISO2 codes into the "cown" coding used for actorid; warn about
# codes that cannot be matched.
wdi$actorid <- countrycode(wdi$iso2c, "iso2c", "cown", warn = TRUE)

# Manual overrides, applied by WDI country name after the automatic mapping.
wdi$actorid[wdi$country == "Kosovo"] <- 347       # Kosovo
wdi$actorid[wdi$country == "Serbia"] <- 345       # Serbia (Yugoslavia)
wdi$actorid[wdi$country == "South Sudan"] <- 626  # South Sudan (to be sure that it does not match Sudan)
wdi$actorid[wdi$country == "Yemen, Rep."] <- 678  # used in SVAC for Yemen (In CoW: Yemen Arab Republic!)
wdi$actorid[wdi$country == "South Africa"] <- 560 # South Africa
wdi$actorid[wdi$country == "Zimbabwe"] <- 552     # Zimbabwe

# Log-transform the three indicators.
wdi$wdi.rural.log <- log(wdi$SP.RUR.TOTL)
wdi$wdi.rural.percentage.log <- log(wdi$SP.RUR.TOTL.ZS)
wdi$wdi.pop.density.log <- log(wdi$EN.POP.DNST)

# log(0) is -Inf; infinite results are recoded to 0.
# NOTE(review): this makes a raw value of 0 indistinguishable from a raw
# value of 1 (log(1) == 0) -- confirm this is intended.
wdi$wdi.rural.log[is.infinite(wdi$wdi.rural.log)] <- 0
wdi$wdi.rural.percentage.log[is.infinite(wdi$wdi.rural.percentage.log)] <- 0
wdi$wdi.pop.density.log[is.infinite(wdi$wdi.pop.density.log)] <- 0

# Drop rows without an actorid and keep only the join keys plus the
# transformed variables.
wdi <- subset(wdi, !is.na(actorid),
              select = c(actorid, year, wdi.rural.log,
                         wdi.rural.percentage.log, wdi.pop.density.log))

# Join to Master dataset
Master <- join(Master, wdi, by = c("actorid", "year"), type = "left",
               match = "all")

# Remove the temporary source tables from the workspace.
rm(penn, wdi)


##
## Rural Population (UNSD)
## PROBLEM: some country-year pairs occur twice, but with different population figures (e.g. Iran, 1994)
## PROBLEM 2: far too few cases
##

# rural <- read.csv(file = "Daten/UNSD Rural Population/RuralPopulation.csv", header = T, sep = ",")
# rural <- subset(rural, rural$Area=="Rural" & rural$Sex=="Both Sexes" & rural$Year > 1988 & rural$Year < 2010, c(Country.or.Area, Year, Value))
# 
# rural$actorid <- countrycode(rural$Country.or.Area, "country.name", "cown", warn = T)
# rural[rural$Country.or.Area=="Republic of South Sudan",]$actorid <- 626
# 
# rural$rural_population_log <- log(rural$Value)
# rural <- subset(rural, !is.na(rural$actorid), c(actorid, Year, rural_population_log))
# rural <- rename(rural, c("Year"="year"))
# rural <- subset(rural, !duplicated(rural))
# Master <- join(Master, rural, type="left", by=c("actorid", "year"), match="all")


# (Disabled) Extract latest version of population density from WDI and add it directly to the state-only dataset
# wdi <- WDI(country="all", indicator = "EN.POP.DNST", extra = FALSE,
#            start = 1989, end = 2009)
# 
# wdi$actorid <- countrycode(wdi$iso2c, "iso2c", "cown", warn = T)
# wdi$actorid[wdi$country=="Kosovo"] <- 347 # Kosovo 
# wdi$actorid[wdi$country=="Serbia"] <- 345 # Serbia (Yugoslavia)
# wdi$actorid[wdi$country=="South Sudan"] <- 626 # South Sudan (to be sure that it does not match Sudan)
# wdi$actorid[wdi$country=="Yemen, Rep."] <- 678 # used in SVAC for Yemen (In CoW: Yemen Arab Republic!)
# wdi$actorid[wdi$country=="South Africa"] <- 560 # South Africa
# wdi$actorid[wdi$country=="Zimbabwe"] <- 552 # Zimbabwe
# 
# wdi$wdi.pop.density.log <- log(wdi$EN.POP.DNST) 
# wdi$wdi.pop.density.log[is.infinite(wdi$wdi.pop.density.log)] <- 0
# 
# wdi <- subset(wdi, is.na(actorid)==FALSE, 
#               select=c(actorid, year, wdi.pop.density.log))
# 
# # Join to Master dataset
# states <- join(states, wdi, by = c("actorid", "year"), type = "left", match = "all")
# Diagnostic: number of rows in `states` with a missing log population density.
# NOTE(review): the join that adds wdi.pop.density.log to `states` is commented
# out above -- confirm the column is still created elsewhere.
sum(is.na(states$wdi.pop.density.log))
